# ---
#title: "Replication code for Historical Political Economy: Past, Present, and Future."
#authors: Volha Charnysh, Eugene Finkel, and Scott Gehlbach
#date: October 2022

#R version: 3.6.3 (2020-02-29) 
#Platform: x86_64-apple-darwin15.6.0 (64-bit)
# ---
  
library(tidyverse)
library(psych)
library(openxlsx)
library(ggsci)
library(viridis)
library(wordcloud2)
library(RColorBrewer)
library(tm)

# Import data ----
# Article-level sheet of the workbook; later sections treat each row as one
# HPE article.
hpe <- read.xlsx(xlsxFile = "HPE MAIN_TABLE_Sep22.xlsx", sheet = "HPE Articles")

# Quick tabulations of the HPE indicator and of publication years.
table(hpe$HPE)
table(hpe$Year)

# Shares of single-country articles and of articles with a formal model.
prop.table(table(hpe$Single.country)) # N=160, 67%.
prop.table(table(hpe$Formal.model.or.not == 1)) # 11%


## Figure 1: share of HPE articles from total.

# Annual article totals (sheet "annual counts"); its Total column is the
# denominator for the yearly share.
count <- read.xlsx("HPE MAIN_TABLE_Sep22.xlsx", sheet = "annual counts")

# Number of HPE articles per publication year.
gathered_year <- data.frame(table(hpe$Year))
names(gathered_year) <- c("Year", "Count")

# Inner join on Year: only years present in both tables are kept.
gathered_year2 <- merge(gathered_year, count, by = "Year")
gathered_year2$share <- 100 * (gathered_year2$Count / gathered_year2$Total)

# Year comes from table() and is therefore discrete, so group = 1 is needed
# for geom_line() to connect the points.
total_share <- ggplot(data = gathered_year2, aes(x = Year, y = share, group = 1)) +
  geom_line() +
  geom_point() +
  ylim(0, 11) +
  labs(y = "Share of all articles (%)", x = "Year of publication") +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = .5),
    axis.text = element_text(size = 12)
  )

# ggsave() fails when the output folder is missing, so make sure it exists
# before writing; this is a no-op when the folder is already there.
dir.create("graphs_pdf", showWarnings = FALSE)
ggsave("graphs_pdf/TotShare.pdf", total_share, height = 6, width = 10)

#Figure 2: share of HPE articles by journal.

# Number of HPE articles per journal. (An earlier gather() call whose result
# was immediately overwritten has been dropped.)
data_gathered <- data.frame(table(hpe$Journal))
names(data_gathered) <- c("Journal", "Count")

# Total number of articles per journal, summed over all years in `count`.
# Totals are looked up by journal name rather than listed positionally, so
# they cannot silently fall out of step with the row order of data_gathered;
# a journal without a matching column in `count` now raises an error.
journal_names <- as.character(data_gathered$Journal)
totbyjournal <- unname(colSums(count[, journal_names, drop = FALSE]))

data_gathered$TotalArts <- totbyjournal

data_gathered$Journal <- factor(
  data_gathered$Journal,
  levels = c("APSR", "AJPS", "JOP", "BJPS", "QJPS", "WP", "CPS", "CP")
)
data_gathered$Share <- 100 * data_gathered$Count / data_gathered$TotalArts

# theme_bw() is a complete theme and resets every theme element, so it has to
# come before the theme() tweaks; in the previous order the axis text size was
# discarded. legend.position uses the documented lowercase value "none".
share_arts_journal <- ggplot(
  data = data_gathered,
  aes(x = reorder(Journal, Share), y = Share, fill = Journal)
) +
  geom_bar(stat = "identity") +
  scale_fill_grey() +
  labs(y = "HPE as share of all articles in journal (%)", x = "Journal") +
  theme_bw() +
  theme(axis.text = element_text(size = 12), legend.position = "none")


ggsave("graphs_pdf/journalShare.pdf", share_arts_journal, height = 6, width = 10) #changed from plots


# Articles by subfield in general-interest journals
in_general_interest <- hpe$Journal %in% c("AJPS", "APSR", "JOP")
prop.table(table(hpe$Subfield[in_general_interest]))


# Counting articles by region (as coded in the sheet, before any recoding):
prop.table(table(hpe$Region))

# Recoding a few categories: the two multi-region labels are folded into
# "World", and "Asia, Middle East" into "Asia".
multi_region_labels <- c("Africa and Asia", "Europe, Africa, Asia, Middle East")
hpe$Region[hpe$Region %in% multi_region_labels] <- "World"
hpe$Region[hpe$Region %in% "Asia, Middle East"] <- "Asia"

# Data summary: country and region distributions after recoding, plus the
# regions of the articles with Single.country equal to 0.
prop.table(table(hpe$Country.if.single.country))
prop.table(table(hpe$Region))
table(hpe$Country.if.single.country)
table(hpe$Region[hpe$Single.country == 0])

#Note proportion for Germany includes Germany alone and Germany, UK (Kasara and Mares)
#Note proportion for UK includes UK alone and Germany, UK (Kasara and Mares)

#creating a new data frame to classify all countries into regions

# Region -> values of Country.if.single.country that belong to it.
# Articles coded 'Multiple' are counted under 'World'.
region_countries <- list(
  'Africa' = c('Burundi', 'Cameroon', 'German East Africa', 'Ghana', 'Namibia', 'Senegal'),
  'Asia' = c('China', 'India', 'Indonesia', 'Philippines', 'Thailand'),
  'Europe (East)' = c('Belarus', 'Poland', 'Romania', 'Russia', 'Ukraine'),
  'Europe (West)' = c('Denmark', 'France', 'Germany', 'Italy', 'Netherlands', 'Norway',
                      'Spain', 'Switzerland', 'UK', 'Israel', 'UK, Germany'),
  'North America' = c('USA', 'Canada'),
  'Central/Latin America' = c('Argentina', 'Brazil', 'Chile', 'Dominican Republic',
                              'Paraguay', 'Mexico', 'Peru'),
  'Oceania' = c('New Zealand'),
  'World' = c('Multiple')
)

# Count articles per region. %in% is used for every region (including
# 'World', which previously used ==) so that a missing country value is simply
# not counted instead of turning the count, and the check below, into NA.
region_counts <- data.frame(
  Region = names(region_countries),
  Counts = unname(vapply(
    region_countries,
    function(countries) sum(hpe$Country.if.single.country %in% countries),
    numeric(1)
  )),
  Proportion = 0
)

#checking that all countries were actually classified into a region
sum(region_counts$Counts) == nrow(hpe)

# Values that fall into no region (should be empty when the check is TRUE).
setdiff(hpe$Country.if.single.country, unlist(region_countries))

#adding proportion of articles from each region relative to all hpe articles
region_counts$Proportion <- region_counts$Counts / nrow(hpe)


#Understanding the average timespan covered by HPE articles that study persistence

# Begin years recorded as "<year> BC" are rewritten as negative numbers (still
# stored as text) so that as.numeric() below can parse them.
bc_recodes <- c("505 BC" = "-505", "100 BC" = "-100", "4000 BC" = "-4000")
for (bc_label in names(bc_recodes)) {
  hpe$Begin[which(hpe$Begin == bc_label)] <- bc_recodes[[bc_label]]
}

# Timespan = End - Begin; entries that cannot be parsed as numbers become NA.
hpe$Timespan <- as.numeric(hpe$End) - as.numeric(hpe$Begin)
summary(hpe$Timespan)

#Calculating average timespan for articles classified as "Understand the present":
is_present_type <- str_detect(hpe$Type, "Understand the present")
summary(hpe$Timespan[is_present_type]) #Mean=226.59

## Summarizing different periods that articles focus on:

# Parse the start year once instead of re-coercing hpe$Begin for every
# comparison (each coercion re-emitted the same "NAs introduced by coercion"
# warning). Rows whose Begin is not a number stay NA and are left unassigned.
begin_year <- as.numeric(hpe$Begin)

hpe$BeginCentury <- rep(NA, nrow(hpe))

hpe$BeginCentury[which(begin_year < 1400)] <- "Before 1400"
hpe$BeginCentury[which(begin_year >= 1400 & begin_year < 1789)] <- "1400-1789"
hpe$BeginCentury[which(begin_year >= 1789 & begin_year <= 1914)] <- "1789-1914" #Long Nineteenth
hpe$BeginCentury[which(begin_year > 1914)] <- "After 1914"

#Manual coding for a few articles where dates were imprecise but can be categorized into periods:
after_1914_titles <- c(
  "Redistributive Political Transitions: Minority Rule and Liberation Wars in Colonial Africa", #decolonization
  "The Origins of Patronage Politics: State Building, Centrifugalism, and Decolonization",
  "A Reanalysis of the Relationship between Indirect Rule, Ethnic Inclusion, and Decolonization", #decolonization
  "Who Inherits the State? Colonial Rule and Postcolonial Conflict" #decolonization
)
hpe$BeginCentury[hpe$Title %in% after_1914_titles] <- "After 1914"
hpe$BeginCentury[hpe$Title %in% "Origins of Early Democracy"] <- "Before 1400" #Ahmed & Stasavage

table(hpe$BeginCentury)

prop.table(table(hpe$BeginCentury))

#Any patterns by region?
prop.table(table(hpe$BeginCentury[hpe$Region == "Europe"]))
prop.table(table(hpe$BeginCentury[hpe$Region == "World"]))
prop.table(table(hpe$BeginCentury[hpe$Single.country == 1]))
prop.table(table(hpe$BeginCentury[hpe$Single.country == 0]))

#Calculating types of articles by the use of history

table(hpe$Type)

# One 0/1 indicator column per use of history. Type is searched as free text,
# so an article whose Type mentions several uses gets several indicators; a
# missing Type yields 0 in all three.
hpe <- hpe %>% mutate(
  `Explore theory` = case_when(
    str_detect(Type, 'theory') ~ 1,
    TRUE ~ 0
  ),
  `Understand the past` = case_when(
    str_detect(Type, 'past') ~ 1,
    TRUE ~ 0
  ),
  `Understand the present` = case_when(
    str_detect(Type, 'present') ~ 1,
    TRUE ~ 0
  )
)


type_counts <- data.frame(
  Type = c('Explore theory', 'Understand the past', 'Understand the present'),
  Count = c(
    sum(hpe$`Explore theory`),
    sum(hpe$`Understand the past`),
    sum(hpe$`Understand the present`)
  )
)

#barplot showing counts of each type of article. Not in the paper.
types <- ggplot(data = type_counts, aes(x = Type, y = Count, fill = Type)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = Count), vjust = 1.6, color = "black", size = 4) +
  ggtitle(label = 'Number of articles by HPE type')

#Proportions by type calculated in the paper are based on type_counts divided by total number of articles.
#Note some articles are in several categories.
# The denominator is taken from the data (as for the region proportions)
# instead of a hard-coded article count, so it stays correct if rows change.
type_counts$Proportion <- type_counts$Count / nrow(hpe)

#Figure 3: keywords coded manually for each article as word cloud
#Source: https://www.jigsawacademy.com/how-to-create-a-word-cloud-in-r/#Main-steps-to-create-Word-Cloud-in-R

# Corpus with one document per entry of word_cloud_terms; punctuation and
# common English words are removed (whitespace stripping is switched off).
keyword_text <- hpe$word_cloud_terms
keyword_corpus <- Corpus(VectorSource(keyword_text))
keyword_corpus <- tm_map(keyword_corpus, removePunctuation) #%>% tm_map(stripWhitespace)
keyword_corpus <- tm_map(keyword_corpus, removeWords, stopwords("english"))

# Term frequencies across all documents, most frequent first.
keyword_dtm <- TermDocumentMatrix(keyword_corpus)
keyword_matrix <- as.matrix(keyword_dtm)
keyword_words <- sort(rowSums(keyword_matrix), decreasing = TRUE)
keyword_df <- data.frame(word = names(keyword_words), freq = keyword_words)

# Keep only terms that occur more than twice.
keyword_df_Sm <- keyword_df[which(keyword_df$freq > 2), ]
keyword_df_Sm$word <- as.character(keyword_df_Sm$word)

# Multi-word keywords show up as run-together tokens (presumably because
# removePunctuation deleted their separator -- confirm against the raw
# keywords); give them readable display labels.
display_labels <- c(
  statecapacity = "state capacity",
  statebuilding = "state building",
  propertyrights = "property rights",
  economicdevelopment = "economic development",
  politicaldevelopment = "political development",
  socialnetworks = "social networks",
  directrule = "direct rule",
  politicalregime = "political regime",
  welfarestate = "welfare state",
  stateformation = "state formation",
  proportionalrepresentation = "PR",
  publicgoods = "public goods",
  collectiveaction = "collective action"
)
needs_label <- keyword_df_Sm$word %in% names(display_labels)
keyword_df_Sm$word[needs_label] <- unname(display_labels[keyword_df_Sm$word[needs_label]])

wc <- wordcloud2(keyword_df_Sm, color = 'random-dark', size = .6)
#Saved manually

##Figure 4: classifying HPE articles by type and topic.
table(hpe$topic)

# Returns 1 where `topic` matches the regular expression `pattern`, 0 where it
# does not, and NA where `topic` itself is missing.
# NOTE(review): patterns are case-sensitive, unanchored substrings, so a short
# alternative such as "development" or "party" also matches longer phrases
# that contain it -- confirm this is intended for each topic.
flag_topic <- function(topic, pattern) {
  matched <- str_detect(topic, pattern)
  ifelse(matched, 1, ifelse(is.na(topic), NA, 0))
}

hpe$violence <- flag_topic(hpe$topic, "conflict|violence|repression|war")
table(hpe$violence)

hpe$democratization <- flag_topic(hpe$topic, "democratization|regime change|democratic development|democratic constitutions|Democratization")
table(hpe$democratization)

hpe$autocracy <- flag_topic(hpe$topic, "authoritarian politics|political development|authoritarian rule|autocracy|monarchy|censorship|repression")
table(hpe$autocracy)

hpe$colonialism <- flag_topic(hpe$topic, "colonialism|imperialism|precolonial legacies")
table(hpe$colonialism)

hpe$state <- flag_topic(hpe$topic, "state capacity|taxation|state building|state development|state building|institutions|public goods|spending|state_capacity|treaties|trade")
table(hpe$state)

hpe$clientelism <- flag_topic(hpe$topic, "clientelism|governance|budget|corruption|patronage|rents|budgets")
table(hpe$clientelism)

hpe$legislature <- flag_topic(hpe$topic, "congress|legislature|party system|parties|party development|party competition|party")
table(hpe$legislature)

hpe$electoralrules <- flag_topic(hpe$topic, "proportional representation|electoral reform|electoral rules|voting|electoral institutions|democratic constitutions|presidency|supreme court|electoral systems|suffrage|voter|political participation|primary")
table(hpe$electoralrules)

hpe$migration <- flag_topic(hpe$topic, "migration|ethnic relations|ethnic identity|intergroup relations|immigration")
table(hpe$migration)

hpe$econdev <- flag_topic(hpe$topic, "economic development|economic prosperity|development|economic_development")
table(hpe$econdev)

hpe$social <- flag_topic(hpe$topic, "Collective action|social movements|civil rights|culture|inequality|media|religion|intergenerational transmission|social networks|welfare|pensions")
table(hpe$social)


# Data behind Figure 4: for each of six topics, the number of articles flagged
# for that topic within each use-of-history type.

# Display label -> name of the 0/1 topic indicator column in hpe.
topic_flag_cols <- c(
  "Violence and repression" = "violence",
  "Democratization" = "democratization",
  "State building/capacity" = "state",
  "Colonialism" = "colonialism",
  "Legislative and party politics" = "legislature",
  "Autocracy" = "autocracy"
)
# Names of the 0/1 type indicator columns in hpe, in row order within a topic.
history_types <- c("Understand the past", "Understand the present", "Explore theory")

# One entry per topic/type pair, with the topic varying slowest.
pair_flag_col <- rep(unname(topic_flag_cols), each = length(history_types))
pair_type_col <- rep(history_types, times = length(topic_flag_cols))

# Sum of a topic indicator over the articles of one type; articles with a
# missing topic indicator are ignored.
count_flagged <- function(flag_col, type_col) {
  sum(hpe[[flag_col]][hpe[[type_col]] == 1], na.rm = TRUE)
}

df2 <- data.frame(
  topic = rep(names(topic_flag_cols), each = length(history_types)),
  Type = pair_type_col,
  count = mapply(count_flagged, pair_flag_col, pair_type_col, USE.NAMES = FALSE)
)

# Factor level order fixes the stacking order of the fill and the order of the
# topics along the x axis.
df2$Type <- factor(
  df2$Type,
  levels = c("Explore theory", "Understand the present", "Understand the past")
)
df2$topic <- factor(
  df2$topic,
  levels = c("Legislative and party politics", "State building/capacity", "Colonialism",
             "Violence and repression", "Democratization", "Autocracy")
)


topics_type <- ggplot(data = df2, aes(x = topic, y = count, fill = Type)) +
  geom_bar(stat = "identity") +
  xlab("Topics") +
  ylab("Number of articles") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 10, size = 13),
    axis.title = element_text(size = 13),
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 14)
  ) +
  scale_fill_grey() +
  guides(fill = guide_legend(title = "Use of history"))

ggsave("graphs_pdf/topics_type.pdf", topics_type, height = 6, width = 10)

